Raw data quality control


Initialization

We start the analysis by initializing the packages required for all the analysis performed in this section. We also define the root directory, within which all the input/output operations for this project will be performed. At the end of this document, detailed software version information is provided for easier reproducibility of the analysis.


Raw data summary

We show below the summary for the raw screening data - Yeast Eessential DaMP library.

     Plate          Well_96           Well           Group      
 2      : 3888   A11    :  576   A41    :   36   AL     :  576  
 4      : 3840   D10    :  576   A42    :   36   AT     :  576  
 3      : 3792   D2     :  576   A43    :   36   K      :  576  
 9      : 3696   A1     :  528   A44    :   36   A      :  528  
 8      : 3360   D1     :  528   B41    :   36   AK     :  528  
 5      : 3216   D5     :  528   B42    :   36   AO     :  528  
 (Other):16032   (Other):34512   (Other):37608   (Other):34512  
         Content     SystematicName        SGD.ID          Gene.Symbol       
 Blank       :9456   Length:37824       Length:37824       Length:37824      
 Control     :9456   Class :character   Class :character   Class :character  
 Cytoplasm   :9456   Mode  :character   Mode  :character   Mode  :character  
 Mitochondria:9456                                                           
                                                                             
                                                                             
                                                                             
  roGFP2.ratio    Signal_485_520  Signal_400_520         Type      
 Min.   :-9.746   Min.   :3.510   Min.   :4.626   Glucose  :12608  
 1st Qu.: 0.469   1st Qu.:4.379   1st Qu.:4.963   Galactose:12608  
 Median : 0.568   Median :4.980   Median :5.088   Glycerol :12608  
 Mean   : 0.648   Mean   :4.768   Mean   :5.066                    
 3rd Qu.: 0.817   3rd Qu.:5.104   3rd Qu.:5.172                    
 Max.   : 6.500   Max.   :5.415   Max.   :5.415                    
 NA's   :9456                                                      

Background Fluorescence

Firstly, we look at the distribution of background flourescence across all plates and retain only those colonies for Control, Mitochondria and Cytoplasm whose fluorescence signal exceeds the 90th percentile of blank 485-520 and 400-520 fluorescence signals

tmpdat = rawDat[, c("Plate", "Content", "Signal_485_520", "Signal_400_520", "Type")]
tmpdat = melt(tmpdat)

a = droplevels(tmpdat[tmpdat$variable == "Signal_485_520" & tmpdat$Content == "Blank", 
    ])
per90_485_520 = lapply(split(a, a$Plate), function(x) split(x, x$Type))
per90_485_520 = sapply(per90_485_520, function(x) sapply(x, function(y) quantile(y$value, 
    0.9)))
per90_485_520 = melt(t(per90_485_520))
per90_485_520$Var2 = gsub(".90%", "", per90_485_520$Var2)
colnames(per90_485_520) = c("Plate", "Type", "value")

a = droplevels(tmpdat[tmpdat$variable == "Signal_400_520" & tmpdat$Content == "Blank", 
    ])
per90_400_520 = lapply(split(a, a$Plate), function(x) split(x, x$Type))
per90_400_520 = sapply(per90_400_520, function(x) sapply(x, function(y) quantile(y$value, 
    0.9)))
per90_400_520 = melt(t(per90_400_520))
per90_400_520$Var2 = gsub(".90%", "", per90_400_520$Var2)
colnames(per90_400_520) = c("Plate", "Type", "value")

# Filtering out colonies

if (identical(paste0(per90_400_520$Plate, per90_400_520$Type), paste0(per90_485_520$Plate, 
    per90_485_520$Type))) {
    cutoff = data.frame(per90_400_520[, 1:2], per90_400_520 = per90_400_520$value, 
        per90_485_520 = per90_485_520$value, stringsAsFactors = F)
}

rawDatFilt = lapply(split(rawDat, rawDat$Plate), function(x) split(x, x$Type))

for (i in 1:nrow(cutoff)) {
    x = rawDatFilt[[as.character(cutoff$Plate[i])]][[cutoff$Type[i]]]
    x = x[x$Signal_485_520 > cutoff$per90_485_520[i] & x$Signal_400_520 > cutoff$per90_400_520[i], 
        ]
    rawDatFilt[[as.character(cutoff$Plate[i])]][[cutoff$Type[i]]] = x
}

rawDatFilt = do.call("rbind", lapply(rawDatFilt, function(x) do.call("rbind", x)))

# Plot background fluorescence distributions
p1 = ggplot(tmpdat, aes(x = value, fill = Content)) + geom_density(alpha = 0.5) + 
    theme_bw(base_size = 8) + labs(subtitle = "Essential DAmP screen", x = "Fluorescence  signal (in log10)", 
    y = "Density", fill = "") + facet_wrap(variable ~ Type, scales = "free", ncol = 3) + 
    theme(legend.position = "right", panel.grid = element_blank())

p2 = ggplot(tmpdat[tmpdat$variable == "Signal_485_520", ], aes(x = value, fill = Content)) + 
    geom_density(alpha = 0.5) + theme_bw(base_size = 8) + labs(subtitle = "Essential DAmP screen | Signal_485_520", 
    x = "Fluorescence  signal (in log10)", y = "Density", fill = "") + geom_vline(data = per90_485_520, 
    aes(xintercept = value)) + facet_wrap(Plate ~ Type, scales = "free", ncol = 3) + 
    theme(legend.position = "right", panel.grid = element_blank())

p3 = ggplot(tmpdat[tmpdat$variable == "Signal_400_520", ], aes(x = value, fill = Content)) + 
    geom_density(alpha = 0.5) + theme_bw(base_size = 8) + labs(subtitle = "Essential DAmP screen | Signal_400_520", 
    x = "Fluorescence  signal (in log10)", y = "Density", fill = "") + geom_vline(data = per90_400_520, 
    aes(xintercept = value)) + facet_wrap(Plate ~ Type, scales = "free", ncol = 3) + 
    theme(legend.position = "right", panel.grid = element_blank())

ggsave(filename = paste0(path, "analysis/QC/background_fluorescence.pdf"), plot = p1, 
    width = 7, height = 3.5)
ggsave(filename = paste0(path, "analysis/QC/background_fluorescence_Signal_485_520.pdf"), 
    plot = p2, width = 7, height = 15, limitsize = F)
ggsave(filename = paste0(path, "analysis/QC/background_fluorescence_Signal_400_520.pdf"), 
    plot = p3, width = 7, height = 15, limitsize = F)

p1

[1] "total number of colonies in the raw data - 37824"
[1] "total number of colonies in the background flourescence filtered data - 27541"
[1] "% of colonies KEPT - 72.81"

Cleaning raw data

Firstly we read the raw yeast redox screen data and remove outlier (extreme values) and NA. Typically, outlier removal is not considered good data analysis practice, however, we do it here because we know that the extreme values comes from technical artifacts. For example, roGFP2 ratios cannot be negative and those which are significantly higher than the plate average are ambiguous signals.

Outliers were detected as -

  • upper outlier values > 75th quantile value for a plate + 3 * plate IQR value
  • lower outlier values < 25th quantile value for a plate - 3 * plate IQR value

The outlier values are all set to NA and values greater than the lower bound threshold but less than 0 were set to pseudo minimum value of 0.0001 as roGFP2 ratios can’t be less than 0.

We show these extreme outliers below, ploting the distribution of values from each plate.

Next, lets check the summaries of the raw roGFP2 redox screen data and the cleaned raw data for comparison.

Raw data looked like -

     Plate          Well_96           Well           Group      
 2      : 2892   D10    :  451   A41    :   36   AT     :  451  
 4      : 2690   A11    :  447   A42    :   36   K      :  447  
 9      : 2682   H8     :  413   A43    :   36   CN     :  413  
 3      : 2595   G9     :  405   A44    :   36   CC     :  405  
 5      : 2462   D7     :  402   B41    :   36   AQ     :  402  
 8      : 2438   D12    :  394   B42    :   36   AV     :  394  
 (Other):11782   (Other):25029   (Other):27325   (Other):25029  
         Content     SystematicName        SGD.ID          Gene.Symbol       
 Blank       : 812   Length:27541       Length:27541       Length:27541      
 Control     :8671   Class :character   Class :character   Class :character  
 Cytoplasm   :8826   Mode  :character   Mode  :character   Mode  :character  
 Mitochondria:9232                                                           
                                                                             
                                                                             
                                                                             
  roGFP2.ratio     Signal_485_520  Signal_400_520         Type     
 Min.   :-2.4333   Min.   :3.922   Min.   :4.820   Glucose  :9347  
 1st Qu.: 0.4760   1st Qu.:4.949   1st Qu.:5.071   Galactose:8557  
 Median : 0.5752   Median :5.050   Median :5.134   Glycerol :9637  
 Mean   : 0.6558   Mean   :5.020   Mean   :5.137                   
 3rd Qu.: 0.8239   3rd Qu.:5.138   3rd Qu.:5.201                   
 Max.   : 3.1356   Max.   :5.415   Max.   :5.415                   
 NA's   :812                                                       

Cleaned data looked like -

     Plate          Well_96           Well           Group      
 2      : 2807   A11    :  432   A41    :   36   AT     :  432  
 9      : 2602   D10    :  432   A42    :   36   K      :  432  
 4      : 2601   D7     :  396   A43    :   36   AQ     :  396  
 3      : 2501   G9     :  396   A44    :   36   CC     :  396  
 5      : 2396   D5     :  382   B41    :   36   AO     :  382  
 8      : 2363   E6     :  365   B42    :   36   BB     :  365  
 (Other):11443   (Other):24310   (Other):26497   (Other):24310  
         Content     SystematicName        SGD.ID          Gene.Symbol       
 Control     :8670   Length:26713       Length:26713       Length:26713      
 Cytoplasm   :8825   Class :character   Class :character   Class :character  
 Mitochondria:9218   Mode  :character   Mode  :character   Mode  :character  
                                                                             
                                                                             
                                                                             
                                                                             
  roGFP2.ratio    Signal_485_520  Signal_400_520         Type     
 Min.   :0.0001   Min.   :4.449   Min.   :4.831   Glucose  :9059  
 1st Qu.:0.4760   1st Qu.:4.959   1st Qu.:5.076   Galactose:8287  
 Median :0.5751   Median :5.055   Median :5.137   Glycerol :9367  
 Mean   :0.6553   Mean   :5.045   Mean   :5.141                   
 3rd Qu.:0.8237   3rd Qu.:5.141   3rd Qu.:5.204                   
 Max.   :2.2561   Max.   :5.415   Max.   :5.415                   
                                                                  

Percentage of data removed after outlier removal -

[1] 3.01

Identification of edge effects

A well known technical issue with yeast mutant screens is the altered growth characteristics of yeast colonies growing at the edge of plates. Next, we look at how roGFP2 ratios are different between edge wells and non-edge wells.

These are the edge wells and the mutants originating from these locations should be carefully analyzed.

  [1] "I01" "Q01" "Y01" "A03" "D02" "F04" "I03" "L02" "N04" "Q03" "T02" "V04"
 [13] "Y03" "b02" "d04" "A09" "A17" "A25" "A33" "A41" "B05" "B13" "B21" "B29"
 [25] "B37" "B45" "C09" "C17" "C25" "C33" "C41" "D05" "D13" "D21" "D29" "D37"
 [37] "D45" "F45" "H45" "J45" "L45" "N45" "P45" "R45" "T45" "V45" "X45" "Z45"
 [49] "b05" "b13" "b21" "b29" "b37" "b45" "c09" "c17" "c25" "c33" "c41" "d05"
 [61] "d13" "d21" "d29" "d37" "d45" "e09" "e17" "e25" "e33" "e41" "J01" "R01"
 [73] "Z01" "A04" "D03" "G02" "I04" "L03" "O02" "Q04" "T03" "W02" "Y04" "b03"
 [85] "e02" "A10" "A18" "A26" "A34" "A42" "B06" "B14" "B22" "B30" "B38" "B46"
 [97] "C10" "C18" "C26" "C34" "C42" "D06" "D14" "D22" "D30" "D38" "D46" "F46"
[109] "H46" "J46" "L46" "N46" "P46" "R46" "T46" "V46" "X46" "Z46" "b06" "b14"
[121] "b22" "b30" "b38" "b46" "c10" "c18" "c26" "c34" "c42" "d06" "d14" "d22"
[133] "d30" "d38" "d46" "e10" "e18" "e26" "e34" "e42" "K01" "S01" "a01" "B02"
[145] "D04" "G03" "J02" "L04" "O03" "R02" "T04" "W03" "Z02" "b04" "e03" "A11"
[157] "A19" "A27" "A35" "A43" "B07" "B15" "B23" "B31" "B39" "B47" "C11" "C19"
[169] "C27" "C35" "C43" "D07" "D15" "D23" "D31" "D39" "D47" "F47" "H47" "J47"
[181] "L47" "N47" "P47" "R47" "T47" "V47" "X47" "Z47" "b07" "b15" "b23" "b31"
[193] "b39" "b47" "c11" "c19" "c27" "c35" "c43" "d07" "d15" "d23" "d31" "d39"
[205] "d47" "e11" "e19" "e27" "e35" "e43" "L01" "T01" "b01" "B03" "E02" "G04"
[217] "J03" "M02" "O04" "R03" "U02" "W04" "Z03" "c02" "e04" "A12" "A20" "A28"
[229] "A36" "A44" "B08" "B16" "B24" "B32" "B40" "B48" "C12" "C20" "C28" "C36"
[241] "C44" "D08" "D16" "D24" "D32" "D40" "D48" "F48" "H48" "J48" "L48" "N48"
[253] "P48" "R48" "T48" "V48" "X48" "Z48" "b08" "b16" "b24" "b32" "b40" "b48"
[265] "c12" "c20" "c28" "c36" "c44" "d08" "d16" "d24" "d32" "d40" "d48" "e12"
[277] "e20" "e28" "e36" "e44" "M01" "U01" "c01" "B04" "E03" "H02" "J04" "M03"
[289] "P02" "R04" "U03" "X02" "Z04" "c03" "A05" "A13" "A21" "A29" "A37" "A45"
[301] "B09" "B17" "B25" "B33" "B41" "C05" "C13" "C21" "C29" "C37" "C45" "D09"
[313] "D17" "D25" "D33" "D41" "E45" "G45" "I45" "K45" "M45" "O45" "Q45" "S45"
[325] "U45" "W45" "Y45" "a45" "b09" "b17" "b25" "b33" "b41" "c05" "c13" "c21"
[337] "c29" "c37" "c45" "d09" "d17" "d25" "d33" "d41" "e05" "e13" "e21" "e29"
[349] "e37" "e45" "N01" "V01" "d01" "C02" "E04" "H03" "K02" "M04" "P03" "S02"
[361] "U04" "X03" "a02" "c04" "A06" "A14" "A22" "A30" "A38" "A46" "B10" "B18"
[373] "B26" "B34" "B42" "C06" "C14" "C22" "C30" "C38" "C46" "D10" "D18" "D26"
[385] "D34" "D42" "E46" "G46" "I46" "K46" "M46" "O46" "Q46" "S46" "U46" "W46"
[397] "Y46" "a46" "b10" "b18" "b26" "b34" "b42" "c06" "c14" "c22" "c30" "c38"
[409] "c46" "d10" "d18" "d26" "d34" "d42" "e06" "e14" "e22" "e30" "e38" "e46"
[421] "O01" "W01" "e01" "C03" "F02" "H04" "K03" "N02" "P04" "S03" "V02" "X04"
[433] "a03" "d02" "A07" "A15" "A23" "A31" "A39" "A47" "B11" "B19" "B27" "B35"
[445] "B43" "C07" "C15" "C23" "C31" "C39" "C47" "D11" "D19" "D27" "D35" "D43"
[457] "D47" "G47" "I47" "K47" "M47" "O47" "Q47" "S47" "U47" "W47" "Y47" "a47"
[469] "b11" "b19" "b27" "b35" "b43" "c07" "c15" "c23" "c31" "c39" "c47" "d11"
[481] "d19" "d27" "d35" "d43" "e07" "e15" "e23" "e31" "e39" "e47" "P01" "X01"
[493] "A02" "C04" "F03" "I02" "K04" "N03" "Q02" "S04" "V03" "Y02" "a04" "d03"
[505] "A08" "A16" "A24" "A32" "A40" "A48" "B12" "B20" "B28" "B36" "B44" "C08"
[517] "C16" "C24" "C32" "C40" "C48" "D12" "D20" "D28" "D36" "D44" "D48" "G48"
[529] "I48" "K48" "M48" "O48" "Q48" "S48" "U48" "W48" "Y48" "a48" "b12" "b20"
[541] "b28" "b36" "b44" "c08" "c16" "c24" "c32" "c40" "c48" "d12" "d20" "d28"
[553] "d36" "d44" "e08" "e16" "e24" "e32" "e40" "e48"

Visulaizing the differences using both effect size (Fold change) and pvalue.

For none of the plates and conditions, the edge effects exceed the log fold change of +/- 1


Variance among controls

Next we look into the variance exhibited by the controls across plates and nutrient conditions. Ideally, all controls in the plate should have similar values i.e variance close to zero. Below we show the variance of raw roGFP2 ratios among controls per plate and group.

Next we remove all genes with variance among controls > 0.05

[1] "Number of poor control genes removed - 3"

Next we we visualize the fact that in a good screen, individual control values should not deviate much from the median value of all controls from that same plate. For a given plate \(i\), the normalized control values i.e \(NormCtrl\) is given as - \[NormCtrl_i = \frac{Ctrl_i} {median(Ctrl_i)}\] We plot these scaled control values per plate across conditions below. Ideally these values should be close to 1 (shown as red horizontal lines). A table for these scaled control values is also provided.

ctrlDat = split(rawDatCleaned_ctrlfilt, rawDatCleaned_ctrlfilt$Type)
ctrlDat = lapply(ctrlDat, function(x) {
    split(x, x$Plate)
})

for (i in 1:length(ctrlDat)) {
    for (j in 1:length(ctrlDat[[i]])) {
        tmp = droplevels(ctrlDat[[i]][[j]])
        tmp = tmp[, -2]
        
        # Plate control
        tmp.ctrl = tmp[tmp$Content == "Control", ]
        
        # Normalizing factor
        nf = median(tmp.ctrl$roGFP2.ratio, na.rm = T)
        
        # Measure control value deviation from plate median
        ctrlDat[[i]][[j]] = data.frame(Gene = tmp.ctrl$Gene.Symbol, NormControl = round(log2(tmp.ctrl$roGFP2.ratio/nf), 
            3), stringsAsFactors = F)
        # Deleting
        rm(tmp, tmp.ctrl, nf)
    }
    rm(j)
}
rm(i)

ctrlDat = melt(ctrlDat, id = "Gene")
ctrlDat = ctrlDat[, c(4, 5, 1, 3)]
colnames(ctrlDat) = c("Plate", "Nutrient", "Gene", "NormControl")
ctrlDat$Plate = as.factor(ctrlDat$Plate)
ctrlDat$Nutrient = factor(ctrlDat$Nutrient, levels = c("Glucose", "Galactose", "Glycerol"))

# datatable(ctrlDat, rownames = FALSE, filter='top', class='compact', extensions
# = c('Buttons') , options = list(autoWidth = TRUE, dom = 'Blfrtip', buttons =
# c('csv', 'excel') ))

p = ggplot(ctrlDat, aes(x = Plate, y = NormControl)) + theme_bw(base_size = 8) + 
    labs(y = "Control / Median plate control (roGFP2 ratios) in log2 scale") + geom_boxplot(outlier.size = 0.1, 
    lwd = 0.2) + facet_wrap(~Nutrient) + geom_hline(yintercept = 3, color = "orange") + 
    geom_hline(yintercept = -3, color = "orange") + # geom_text_repel(data = subset(ctrlDat, NormControl > 3), aes(x = Plate, y =
# NormControl, label = Gene), size = 2) +
theme(axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5))

p

Next we remove those controls that deviate 3 log fold units over the plate control median value

[1] "Number of control genes removed which deviated 3 log fold units from the plate median - 14"

Plotting these scaled control values -

A more standard way of calculating the variance among the controls is by computing the robust coefficient of variance see here. For a given plate \(i\), this is given as - \[CoefVar_i = \frac{mad(Ctrl_i)}{median(Ctrl_i)}\] mad is Median absolute deviation

Plotting these robust coefficient of variation for control values-


Saving the data

Finally we save the cleaned raw Data as a .RDS (R data object), which will be used for the next step of normalization

Session information

R version 3.6.2 (2019-12-12)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS  10.16

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] ggallin_0.1.1    WriteXLS_5.0.0   DT_0.12          plotly_4.9.2    
 [5] lsr_0.5          ggpubr_0.2.4     magrittr_1.5     ggrepel_0.8.1   
 [9] ggplot2_3.2.1    reshape2_1.4.3   rmdformats_0.3.6 knitr_1.28      

loaded via a namespace (and not attached):
 [1] tidyselect_1.0.0   xfun_0.12          purrr_0.3.3        colorspace_1.4-1  
 [5] vctrs_0.2.2        htmltools_0.4.0    viridisLite_0.3.0  yaml_2.2.1        
 [9] rlang_0.4.4        pillar_1.4.3       later_1.0.0        glue_1.3.1        
[13] withr_2.1.2        RColorBrewer_1.1-2 lifecycle_0.1.0    plyr_1.8.5        
[17] stringr_1.4.0      munsell_0.5.0      ggsignif_0.6.0     gtable_0.3.0      
[21] htmlwidgets_1.5.1  evaluate_0.14      labeling_0.3       fastmap_1.0.1     
[25] Cairo_1.5-11       httpuv_1.5.5       crosstalk_1.0.0    Rcpp_1.0.3        
[29] xtable_1.8-4       promises_1.1.0     scales_1.1.0       formatR_1.7       
[33] jsonlite_1.6.1     mime_0.9           farver_2.0.3       digest_0.6.23     
[37] stringi_1.4.5      bookdown_0.17      dplyr_0.8.4        shiny_1.4.0       
[41] grid_3.6.2         tools_3.6.2        lazyeval_0.2.2     tibble_2.1.3      
[45] crayon_1.3.4       tidyr_1.0.2        pkgconfig_2.0.3    ellipsis_0.3.0    
[49] data.table_1.12.8  assertthat_0.2.1   rmarkdown_2.1      httr_1.4.1        
[53] R6_2.4.1           compiler_3.6.2    

Ashwini Kumar Sharma, PhD

2021-06-18